import pandas as pd
import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from autogluon.tabular import TabularDataset, TabularPredictor
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = pd.concat([df[df["is_fraud"] == 0].sample(frac=0.20, random_state=42),
                df[df["is_fraud"] == 1]])   # keep 20% of normal transactions, all fraud cases
df.head()
Unnamed: 0 | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
669418 | 669418 | 2019-10-12 18:21 | 4.089100e+18 | fraud_Haley, Jewess and Bechtelar | shopping_pos | 7.53 | Debra | Stark | F | 686 Linda Rest | ... | 32.3836 | -94.8653 | 24536 | Multimedia programmer | 1983-10-14 | d313353fa30233e5fab5468e852d22fc | 1350066071 | 32.202008 | -94.371865 | 0 |
32567 | 32567 | 2019-01-20 13:06 | 4.247920e+12 | fraud_Turner LLC | travel | 3.79 | Judith | Moss | F | 46297 Benjamin Plains Suite 703 | ... | 39.5370 | -83.4550 | 22305 | Television floor manager | 1939-03-09 | 88c65b4e1585934d578511e627fe3589 | 1327064760 | 39.156673 | -82.930503 | 0 |
156587 | 156587 | 2019-03-24 18:09 | 4.026220e+12 | fraud_Klein Group | entertainment | 59.07 | Debbie | Payne | F | 204 Ashley Neck Apt. 169 | ... | 41.5224 | -71.9934 | 4720 | Broadcast presenter | 1977-05-18 | 3bd9ede04b5c093143d5e5292940b670 | 1332612553 | 41.657152 | -72.595751 | 0 |
1020243 | 1020243 | 2020-02-25 15:12 | 4.957920e+12 | fraud_Monahan-Morar | personal_care | 25.58 | Alan | Parsons | M | 0547 Russell Ford Suite 574 | ... | 39.6171 | -102.4776 | 207 | Network engineer | 1955-12-04 | 19e16ee7a01d229e750359098365e321 | 1361805120 | 39.080346 | -103.213452 | 0 |
116272 | 116272 | 2019-03-06 23:19 | 4.178100e+15 | fraud_Kozey-Kuhlman | personal_care | 84.96 | Jill | Flores | F | 639 Cruz Islands | ... | 41.9488 | -86.4913 | 3104 | Horticulturist, commercial | 1981-03-29 | a0c8641ca1f5d6e243ed5a2246e66176 | 1331075954 | 42.502065 | -86.732664 | 0 |
5 rows × 23 columns
- Of the 265,342 sampled transactions, 7,506 (2.83%) are fraudulent.
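These figures can be reproduced directly from the sampled frame (a quick check, assuming `df` is the frame built above):

print(len(df))                                # total transactions
print(df["is_fraud"].sum())                   # fraudulent transactions
print(round(df["is_fraud"].mean() * 100, 2))  # fraud rate in percent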
Bipartite graph
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    # Map every card number and merchant name to an integer node id
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])   # edge source (card)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])   # edge target (merchant)
    # Aggregate all transactions between the same card and merchant into one edge
    df = df[["from", "to", "amt", "is_fraud"]].groupby(["from", "to"]).agg(
        {"is_fraud": "sum", "amt": "sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)
    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    # Edge attribute: fraud flag of the (card, merchant) pair
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"]
                               for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")
    # Edge attribute: total transaction amount of the pair
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"]
                               for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")
    return G

G_bu = build_graph_bipartite(df, nx.Graph(name="Bipartite Undirect"))
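Since edges only ever run from a card node to a merchant node, the result should be bipartite. A small sanity check (not in the original; assumes card numbers and merchant names never collide as mapping keys):

print(nx.is_bipartite(G_bu))                           # expected: True
print(G_bu.number_of_nodes(), G_bu.number_of_edges())  # one node per card/merchant, one edge per pair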
Tripartite graph
def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    # Node ids cover transactions (the frame index), cards, and merchants
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist() +
                                                          df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])
    # Each transaction becomes its own node, linked to both its card and its merchant
    G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +
                         [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()],
                         create_using=graph_type)
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    return G

G_tu = build_graph_tripartite(df, nx.Graph())
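With a transaction node sitting between its card and its merchant, the counts should satisfy |V| = #transactions + #cards + #merchants and |E| = 2 × #transactions. A hedged check (assumes transaction indices, card numbers, and merchant names never collide as mapping keys):

print(G_tu.number_of_nodes(), len(df) + df["cc_num"].nunique() + df["merchant"].nunique())
print(G_tu.number_of_edges(), 2 * len(df))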
Supervised learning (bipartite graph)
from sklearn.utils import resample

df_majority = df[df.is_fraud == 0]
df_minority = df[df.is_fraud == 1]

# Downsample the majority (normal) class to the size of the minority (fraud) class
df_maj_downsampled = resample(df_majority,
                              n_samples=len(df_minority),
                              random_state=42)

df_downsampled = pd.concat([df_minority, df_maj_downsampled])

print(df_downsampled.is_fraud.value_counts())

G_down = build_graph_bipartite(df_downsampled)
1 6006
0 6006
Name: is_fraud, dtype: int64
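For reference, the same balanced downsample can be written as a single pandas groupby (a sketch, assuming pandas >= 1.1 for GroupBy.sample; `df_downsampled_alt` is a hypothetical name):

df_downsampled_alt = df.groupby("is_fraud").sample(n=int(df.is_fraud.sum()), random_state=42)
print(df_downsampled_alt.is_fraud.value_counts())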
from sklearn.model_selection import train_test_split

train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42)
edgs = list(G_down.edges)
# Keep only the training edges, then add the now-isolated nodes back so that
# every node of G_down still exists in the training graph
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00, 2.57it/s]
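Only weight_key and window are set explicitly, so everything else falls back to the node2vec package defaults; judging from the log (10 walks) and the embedding shape below (128 dimensions), those defaults are dimensions=128, walk_length=80, num_walks=10. An explicit, equivalent call might look like this (parameter values are an assumption about the installed version):

node2vec_train = Node2Vec(train_graph,
                          dimensions=128,       # size of each node vector
                          walk_length=80,       # steps per random walk
                          num_walks=10,         # walks started from every node
                          weight_key='weight')  # use transaction amounts as edge weights
model_train = node2vec_train.fit(window=10)     # gensim Word2Vec context window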
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)
# Note: the loop only rebinds embeddings_train, so the features used below
# come from the last class, WeightedL2Embedder

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
np.array(train_embeddings).shape
(9351, 128)
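9,351 is the 80% training share of the 11,689 aggregated card–merchant edges of G_down (the remaining 2,338 form the test set below), each represented by a 128-dimensional edge vector:

len(G_down.edges), len(train_edges), len(test_edges)  # expected: (11689, 9351, 2338)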
np.array(train_embeddings)
array([[4.0369573e-01, 2.0337313e-01, 2.1946652e-01, ..., 1.7150687e-01,
3.6633116e-01, 3.2048109e-01],
[6.0373070e-03, 1.6968289e-01, 8.6983815e-03, ..., 2.2079267e-01,
3.2768153e-02, 2.3883855e-02],
[9.2083057e-03, 1.8300842e-02, 8.2615782e-03, ..., 4.4274908e-02,
2.1799646e-01, 2.3926771e-03],
...,
[1.7281795e-01, 4.6769153e-02, 1.9730711e-01, ..., 6.4412162e-02,
3.3814883e-01, 2.4217861e-02],
[6.3609913e-02, 2.2192889e-01, 1.1514757e-04, ..., 9.1643520e-02,
1.5498386e-02, 2.4329810e-01],
[2.6236567e-05, 4.7491617e-03, 9.5967706e-03, ..., 1.5650114e-01,
1.1875462e-02, 9.1554008e-02]], dtype=float32)
np.array(train_labels).shape
(9351,)
# Build the feature DataFrame
columns = [f'embedding_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)

df_labels = pd.DataFrame(data=train_labels, columns=['label'])

# Concatenate features and labels (note: this overwrites the transaction frame df)
df = pd.concat([df_data, df_labels], axis=1)
df
embedding_0 | embedding_1 | embedding_2 | embedding_3 | embedding_4 | embedding_5 | embedding_6 | embedding_7 | embedding_8 | embedding_9 | ... | embedding_119 | embedding_120 | embedding_121 | embedding_122 | embedding_123 | embedding_124 | embedding_125 | embedding_126 | embedding_127 | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.403696 | 0.203373 | 0.219467 | 0.046628 | 0.056515 | 0.018408 | 0.022522 | 0.080388 | 0.098083 | 0.005314 | ... | 0.136802 | 0.000194 | 0.063601 | 0.175472 | 0.044392 | 0.010460 | 0.171507 | 0.366331 | 0.320481 | 0 |
1 | 0.006037 | 0.169683 | 0.008698 | 0.141072 | 0.029439 | 0.097091 | 0.005095 | 0.080614 | 0.132287 | 0.017439 | ... | 0.050932 | 0.008416 | 0.004334 | 0.000474 | 0.000034 | 0.075918 | 0.220793 | 0.032768 | 0.023884 | 1 |
2 | 0.009208 | 0.018301 | 0.008262 | 0.025849 | 0.031677 | 0.000057 | 0.147312 | 0.136967 | 0.002352 | 0.057455 | ... | 0.008357 | 0.055791 | 0.109624 | 0.000029 | 0.007875 | 0.005629 | 0.044275 | 0.217996 | 0.002393 | 1 |
3 | 0.129434 | 0.036309 | 0.040281 | 0.056018 | 0.138173 | 0.063305 | 0.023791 | 0.021431 | 0.001766 | 0.000098 | ... | 0.008713 | 0.050279 | 0.028918 | 0.102740 | 0.002691 | 0.000420 | 0.215788 | 0.226286 | 0.014054 | 1 |
4 | 0.055134 | 0.000257 | 0.027203 | 0.406045 | 0.367124 | 0.009524 | 0.000950 | 0.040553 | 0.075501 | 0.123167 | ... | 0.003346 | 0.004510 | 0.057712 | 0.000007 | 0.181280 | 0.009843 | 0.061533 | 0.023981 | 0.006037 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9346 | 0.041404 | 0.148350 | 0.128774 | 0.000489 | 0.034969 | 0.010100 | 0.069104 | 0.044493 | 0.130157 | 0.001190 | ... | 0.510222 | 0.061467 | 0.052151 | 0.142345 | 0.010240 | 0.528946 | 0.089659 | 0.026390 | 0.088123 | 1 |
9347 | 0.007367 | 0.250150 | 0.424588 | 0.140796 | 0.002336 | 0.052449 | 0.044327 | 0.131472 | 0.035484 | 0.123832 | ... | 0.003025 | 0.015720 | 0.167906 | 0.013722 | 0.051255 | 0.302898 | 0.042692 | 0.184819 | 0.039848 | 0 |
9348 | 0.172818 | 0.046769 | 0.197307 | 0.000106 | 0.001071 | 0.024714 | 0.076101 | 0.011439 | 0.326547 | 0.034753 | ... | 0.087797 | 0.325244 | 0.009078 | 0.566709 | 0.025226 | 0.452541 | 0.064412 | 0.338149 | 0.024218 | 0 |
9349 | 0.063610 | 0.221929 | 0.000115 | 0.025852 | 0.000718 | 0.126464 | 0.029603 | 0.000026 | 0.003015 | 0.080191 | ... | 0.000170 | 0.095050 | 0.029800 | 0.004892 | 0.026643 | 0.049939 | 0.091644 | 0.015498 | 0.243298 | 1 |
9350 | 0.000026 | 0.004749 | 0.009597 | 0.075472 | 0.000965 | 0.038021 | 0.000005 | 0.128029 | 0.041665 | 0.019990 | ... | 0.263180 | 0.000784 | 0.140076 | 0.111306 | 0.001472 | 0.000657 | 0.156501 | 0.011875 | 0.091554 | 1 |
9351 rows × 129 columns
label = np.array(train_labels)
predictr = TabularPredictor(label='label')
No path specified. Models will be saved in: "AutogluonModels/ag-20240121_081618/"
predictr.fit(df)
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240121_081618/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 2 18:01:13 UTC 2
Disk Space Avail: 628.19 GB / 982.82 GB (63.9%)
Train Data Rows: 9351
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [0, 1]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 39623.22 MB
Train Data (Original) Memory Usage: 4.79 MB (0.0% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 128 | ['embedding_0', 'embedding_1', 'embedding_2', 'embedding_3', 'embedding_4', ...]
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 128 | ['embedding_0', 'embedding_1', 'embedding_2', 'embedding_3', 'embedding_4', ...]
0.2s = Fit runtime
128 features in original data used to generate 128 features in processed data.
Train Data (Processed) Memory Usage: 4.79 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.24s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8415, Val Rows: 936
User-specified model hyperparameters to be fit:
{
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
0.4765 = Validation score (accuracy)
0.86s = Training runtime
0.08s = Validation runtime
Fitting model: KNeighborsDist ...
0.4765 = Validation score (accuracy)
0.86s = Training runtime
0.02s = Validation runtime
Fitting model: LightGBMXT ...
0.7147 = Validation score (accuracy)
1.73s = Training runtime
0.0s = Validation runtime
Fitting model: LightGBM ...
0.7137 = Validation score (accuracy)
1.6s = Training runtime
0.0s = Validation runtime
Fitting model: RandomForestGini ...
0.7286 = Validation score (accuracy)
2.58s = Training runtime
0.03s = Validation runtime
Fitting model: RandomForestEntr ...
0.7436 = Validation score (accuracy)
3.05s = Training runtime
0.03s = Validation runtime
Fitting model: CatBoost ...
0.7265 = Validation score (accuracy)
4.75s = Training runtime
0.0s = Validation runtime
Fitting model: ExtraTreesGini ...
0.7276 = Validation score (accuracy)
1.22s = Training runtime
0.03s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.7361 = Validation score (accuracy)
1.24s = Training runtime
0.03s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 5: early stopping
0.7329 = Validation score (accuracy)
5.93s = Training runtime
0.01s = Validation runtime
Fitting model: XGBoost ...
0.7009 = Validation score (accuracy)
1.69s = Training runtime
0.01s = Validation runtime
Fitting model: NeuralNetTorch ...
0.7254 = Validation score (accuracy)
3.74s = Training runtime
0.06s = Validation runtime
Fitting model: LightGBMLarge ...
0.7179 = Validation score (accuracy)
4.37s = Training runtime
0.01s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.7479 = Validation score (accuracy)
0.51s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 35.14s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240121_081618/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f9e5e0fffa0>
predictr.leaderboard()
model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order | |
---|---|---|---|---|---|---|---|---|---|
0 | WeightedEnsemble_L2 | 0.747863 | 0.117266 | 18.046104 | 0.001305 | 0.510860 | 2 | True | 14 |
1 | RandomForestEntr | 0.743590 | 0.033528 | 3.046299 | 0.033528 | 3.046299 | 1 | True | 6 |
2 | ExtraTreesEntr | 0.736111 | 0.033238 | 1.237712 | 0.033238 | 1.237712 | 1 | True | 9 |
3 | NeuralNetFastAI | 0.732906 | 0.012168 | 5.929674 | 0.012168 | 5.929674 | 1 | True | 10 |
4 | RandomForestGini | 0.728632 | 0.033677 | 2.584781 | 0.033677 | 2.584781 | 1 | True | 5 |
5 | ExtraTreesGini | 0.727564 | 0.032846 | 1.223976 | 0.032846 | 1.223976 | 1 | True | 8 |
6 | CatBoost | 0.726496 | 0.003742 | 4.750515 | 0.003742 | 4.750515 | 1 | True | 7 |
7 | NeuralNetTorch | 0.725427 | 0.058065 | 3.735725 | 0.058065 | 3.735725 | 1 | True | 12 |
8 | LightGBMLarge | 0.717949 | 0.007159 | 4.371737 | 0.007159 | 4.371737 | 1 | True | 13 |
9 | LightGBMXT | 0.714744 | 0.004040 | 1.732927 | 0.004040 | 1.732927 | 1 | True | 3 |
10 | LightGBM | 0.713675 | 0.002815 | 1.598863 | 0.002815 | 1.598863 | 1 | True | 4 |
11 | XGBoost | 0.700855 | 0.005022 | 1.693087 | 0.005022 | 1.693087 | 1 | True | 11 |
12 | KNeighborsDist | 0.476496 | 0.022600 | 0.860941 | 0.022600 | 0.860941 | 1 | True | 2 |
13 | KNeighborsUnif | 0.476496 | 0.084483 | 0.864856 | 0.084483 | 0.864856 | 1 | True | 1 |
test = np.array(test_embeddings)
test.shape
(2338, 128)
columns = [f'embedding_{i}' for i in range(test.shape[1])]

# Build the test DataFrame
test_df = pd.DataFrame(data=test, columns=columns)

# Inspect it
print(test_df.head())
embedding_0 embedding_1 embedding_2 embedding_3 embedding_4 \
0 0.013685 0.001803 0.025815 0.352866 0.377978
1 0.002213 0.476018 0.174582 0.164626 0.026744
2 0.033581 0.123362 0.004575 0.037874 0.006698
3 0.022749 0.064797 0.000525 0.218562 0.002898
4 0.000060 0.007555 0.001758 0.257433 0.096365
embedding_5 embedding_6 embedding_7 embedding_8 embedding_9 ... \
0 0.007875 0.041015 0.035619 0.091952 0.395759 ...
1 0.013961 0.220435 0.512335 0.052540 0.000499 ...
2 0.050513 0.080717 0.059508 0.039344 0.380898 ...
3 0.006643 0.136263 0.028495 0.027063 0.027161 ...
4 0.075485 0.234585 0.037758 0.006236 0.000702 ...
embedding_118 embedding_119 embedding_120 embedding_121 embedding_122 \
0 0.017987 0.317063 0.191938 0.130174 0.159880
1 0.000210 0.163785 0.001970 0.013025 0.090202
2 0.176072 0.031670 0.134932 0.022356 0.065149
3 0.000395 0.022916 0.014586 0.101680 0.084287
4 0.036900 0.047368 0.093696 0.014713 0.091163
embedding_123 embedding_124 embedding_125 embedding_126 embedding_127
0 1.074022 0.009432 0.037776 0.009084 0.306696
1 0.293918 0.167935 0.027035 0.032836 0.001162
2 0.006532 0.124003 0.188896 0.164872 0.003926
3 0.246152 0.072217 0.357974 0.016832 0.121802
4 0.075833 0.017246 0.928035 0.111811 0.025898
[5 rows x 128 columns]
predictr.predict(test_df).mean()
0.0290846877673225
y = test_labels
yhat = predictr.predict(test_df)
# Evaluation helpers (sklearn metrics)
import sklearn
from sklearn import metrics

def evaluation(y, yhat):
    # Collect several scores into a one-row DataFrame
    metrics = [sklearn.metrics.accuracy_score,
               sklearn.metrics.precision_score,
               sklearn.metrics.recall_score,
               sklearn.metrics.f1_score,
               sklearn.metrics.roc_auc_score]
    return pd.DataFrame({m.__name__: [m(y, yhat).round(6)] for m in metrics})
evaluation(y,yhat)
accuracy_score | precision_score | recall_score | f1_score | roc_auc_score | |
---|---|---|---|---|---|
0 | 0.512404 | 0.691176 | 0.040309 | 0.076175 | 0.511195 |
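Accuracy barely beats chance on this roughly balanced test split, and recall is only about 4%: the predictor flags almost nothing as fraud, which matches predict(test_df).mean() ≈ 0.029 above. A confusion matrix (a quick sketch, not in the original) makes the failure mode explicit:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, yhat))  # rows = true class, columns = predicted class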
# from sklearn.ensemble import RandomForestClassifier
# from sklearn import metrics
# classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
# for cl in classes:
# embeddings_train = cl(keyed_vectors=model_train.wv)
# train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
# test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
# rf = RandomForestClassifier(n_estimators=1000, random_state=42)
# rf.fit(train_embeddings, train_labels);
# #X=train_embeddings
# #y=train_labels
# #df=[X,y]
# # predictr = TabularPredictor(label='train_labels')
# # predictr.fit(df)
# y_pred = rf.predict(test_embeddings)
# print(cl)
# print('Precision:', metrics.precision_score(test_labels, y_pred))
# print('Recall:', metrics.recall_score(test_labels, y_pred))
# print('F1-Score:', metrics.f1_score(test_labels, y_pred))